Netflix EDA¶
InĀ [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from datetime import datetime,timedelta
from itertools import combinations, islice
from collections import Counter
import operator
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
Import the dataset downloaded from kaggle
InĀ [2]:
df = pd.read_csv('netflix-rotten-tomatoes-metacritic-imdb.csv')
Dataset exploration
InĀ [3]:
df.head()
Out[3]:
| Title | Genre | Tags | Languages | Series or Movie | Hidden Gem Score | Country Availability | Runtime | Director | Writer | Actors | View Rating | IMDb Score | Rotten Tomatoes Score | Metacritic Score | Awards Received | Awards Nominated For | Boxoffice | Release Date | Netflix Release Date | Production House | Netflix Link | IMDb Link | Summary | IMDb Votes | Image | Poster | TMDb Trailer | Trailer Site | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Lets Fight Ghost | Crime, Drama, Fantasy, Horror, Romance | Comedy Programmes,Romantic TV Comedies,Horror ... | Swedish, Spanish | Series | 4.3 | Thailand | < 30 minutes | Tomas Alfredson | John Ajvide Lindqvist | KƄre Hedebrant, Per Ragnar, Lina Leandersson, ... | R | 7.9 | 98.0 | 82.0 | 74.0 | 57.0 | $2,122,065 | 12 Dec 2008 | 2021-03-04 | Canal+, Sandrew Metronome | https://www.netflix.com/watch/81415947 | https://www.imdb.com/title/tt1139797 | A med student with a supernatural gift tries t... | 205926.0 | https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... | https://m.media-amazon.com/images/M/MV5BOWM4NT... | NaN | NaN |
| 1 | HOW TO BUILD A GIRL | Comedy | Dramas,Comedies,Films Based on Books,British | English | Movie | 7.0 | Canada | 1-2 hour | Coky Giedroyc | Caitlin Moran | Paddy Considine, Cleo, Beanie Feldstein, Dónal... | R | 5.8 | 79.0 | 69.0 | 1.0 | NaN | $70,632 | 08 May 2020 | 2021-03-04 | Film 4, Monumental Pictures, Lionsgate | https://www.netflix.com/watch/81041267 | https://www.imdb.com/title/tt4193072 | When nerdy Johanna moves to London, things get... | 2838.0 | https://occ-0-1081-999.1.nflxso.net/dnm/api/v6... | https://m.media-amazon.com/images/M/MV5BZGUyN2... | https://www.youtube.com/watch?v=eIbcxPy4okQ | YouTube |
| 2 | Centigrade | Drama, Thriller | Thrillers | English | Movie | 6.4 | Canada | 1-2 hour | Brendan Walsh | Brendan Walsh, Daley Nixon | Genesis Rodriguez, Vincent Piazza | Unrated | 4.3 | NaN | 46.0 | NaN | NaN | $16,263 | 28 Aug 2020 | 2021-03-04 | NaN | https://www.netflix.com/watch/81305978 | https://www.imdb.com/title/tt8945942 | Trapped in a frozen car during a blizzard, a p... | 1720.0 | https://occ-0-1081-999.1.nflxso.net/dnm/api/v6... | https://m.media-amazon.com/images/M/MV5BODM2MD... | https://www.youtube.com/watch?v=0RvV7TNUlkQ | YouTube |
| 3 | ANNE+ | Drama | TV Dramas,Romantic TV Dramas,Dutch TV Shows | Turkish | Series | 7.7 | Belgium,Netherlands | < 30 minutes | NaN | NaN | Vahide PerƧin, Gonca Vuslateri, Cansu Dere, Be... | NaN | 6.5 | NaN | NaN | 1.0 | NaN | NaN | 01 Oct 2016 | 2021-03-04 | NaN | https://www.netflix.com/watch/81336456 | https://www.imdb.com/title/tt6132758 | Upon moving into a new place, a 20-something r... | 1147.0 | https://occ-0-1489-1490.1.nflxso.net/dnm/api/v... | https://m.media-amazon.com/images/M/MV5BNWRkMz... | NaN | NaN |
| 4 | Moxie | Animation, Short, Drama | Social Issue Dramas,Teen Movies,Dramas,Comedie... | English | Movie | 8.1 | Lithuania,Poland,France,Iceland,Italy,Spain,Gr... | 1-2 hour | Stephen Irwin | NaN | Ragga Gudrun | NaN | 6.3 | NaN | NaN | NaN | 4.0 | NaN | 22 Sep 2011 | 2021-03-04 | NaN | https://www.netflix.com/watch/81078393 | https://www.imdb.com/title/tt2023611 | Inspired by her moms rebellious past and a con... | 63.0 | https://occ-0-4039-1500.1.nflxso.net/dnm/api/v... | https://m.media-amazon.com/images/M/MV5BODYyNW... | NaN | NaN |
InĀ [4]:
df.shape
Out[4]:
(15480, 29)
InĀ [5]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 15480 entries, 0 to 15479 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Title 15480 non-null object 1 Genre 13770 non-null object 2 Tags 15413 non-null object 3 Languages 13545 non-null object 4 Series or Movie 15480 non-null object 5 Hidden Gem Score 13379 non-null float64 6 Country Availability 15461 non-null object 7 Runtime 15479 non-null object 8 Director 10772 non-null object 9 Writer 11150 non-null object 10 Actors 13555 non-null object 11 View Rating 8456 non-null object 12 IMDb Score 13381 non-null float64 13 Rotten Tomatoes Score 6382 non-null float64 14 Metacritic Score 4336 non-null float64 15 Awards Received 6075 non-null float64 16 Awards Nominated For 7661 non-null float64 17 Boxoffice 4007 non-null object 18 Release Date 13373 non-null object 19 Netflix Release Date 15480 non-null object 20 Production House 5149 non-null object 21 Netflix Link 15480 non-null object 22 IMDb Link 13177 non-null object 23 Summary 15471 non-null object 24 IMDb Votes 13379 non-null float64 25 Image 15480 non-null object 26 Poster 11842 non-null object 27 TMDb Trailer 7194 non-null object 28 Trailer Site 7194 non-null object dtypes: float64(7), object(22) memory usage: 3.4+ MB
InĀ [6]:
df.describe().T
Out[6]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Hidden Gem Score | 13379.0 | 5.937551 | 2.250202 | 0.6 | 3.8 | 6.8 | 7.9 | 9.8 |
| IMDb Score | 13381.0 | 6.496054 | 1.146910 | 1.0 | 5.8 | 6.6 | 7.3 | 9.7 |
| Rotten Tomatoes Score | 6382.0 | 59.523034 | 26.999173 | 0.0 | 38.0 | 64.0 | 83.0 | 100.0 |
| Metacritic Score | 4336.0 | 56.813653 | 17.582545 | 5.0 | 44.0 | 57.0 | 70.0 | 100.0 |
| Awards Received | 6075.0 | 8.764444 | 18.311171 | 1.0 | 1.0 | 3.0 | 8.0 | 300.0 |
| Awards Nominated For | 7661.0 | 13.983161 | 29.821052 | 1.0 | 2.0 | 5.0 | 12.0 | 386.0 |
| IMDb Votes | 13379.0 | 42728.411615 | 125701.191329 | 5.0 | 403.5 | 2322.0 | 20890.5 | 2354197.0 |
Ratings evaluation between different platforms
InĀ [7]:
# Create a dataframe containing scores from the platforms scaling IMDB to 100
scores_df = pd.DataFrame({'imdb_scores': df['IMDb Score']*10,
'rt_scores': df['Rotten Tomatoes Score'],
'mc_scores': df['Metacritic Score']})
InĀ [8]:
# Different Platfomrms Scores densities
plt.figure(figsize=(12,5), dpi=150)
sns.kdeplot(data=scores_df, x='imdb_scores', label='IMDb Score', fill=True)
sns.kdeplot(data=scores_df, x='rt_scores', label= 'Rotten Tomatoes Score', fill=True)
sns.kdeplot(data=scores_df, x='mc_scores', label= 'Metacritic Score', fill=True)
plt.legend(loc=('upper left'))
plt.xlim(0,100)
plt.title('IMDb vs Rotten Tomatoes vs Metacritic score densities')
plt.xlabel('Score %')
plt.show()
Columns selection
InĀ [9]:
selected_columns = ['Title', 'Genre', 'Languages', 'Series or Movie', 'Hidden Gem Score', 'Runtime',
'Director', 'Actors', 'IMDb Score', 'IMDb Votes', 'Awards Nominated For', 'Awards Received',
'Boxoffice', 'Netflix Release Date', 'Country Availability']
df_selected = df[selected_columns].copy()
df_selected.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 15480 entries, 0 to 15479 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Title 15480 non-null object 1 Genre 13770 non-null object 2 Languages 13545 non-null object 3 Series or Movie 15480 non-null object 4 Hidden Gem Score 13379 non-null float64 5 Runtime 15479 non-null object 6 Director 10772 non-null object 7 Actors 13555 non-null object 8 IMDb Score 13381 non-null float64 9 IMDb Votes 13379 non-null float64 10 Awards Nominated For 7661 non-null float64 11 Awards Received 6075 non-null float64 12 Boxoffice 4007 non-null object 13 Netflix Release Date 15480 non-null object 14 Country Availability 15461 non-null object dtypes: float64(5), object(10) memory usage: 1.8+ MB
InĀ [10]:
df_selected.shape
Out[10]:
(15480, 15)
Dates maniplulation
InĀ [11]:
# Convert the 'Netflix Release Date' column to a datetime format
df_selected['Netflix Release Date'] = pd.to_datetime(df_selected['Netflix Release Date'])
# Extract year and month info
df_selected['Year'] = df_selected['Netflix Release Date'].dt.year
df_selected['Month'] = df_selected['Netflix Release Date'].dt.month
InĀ [12]:
df_selected.dtypes
Out[12]:
Title object Genre object Languages object Series or Movie object Hidden Gem Score float64 Runtime object Director object Actors object IMDb Score float64 IMDb Votes float64 Awards Nominated For float64 Awards Received float64 Boxoffice object Netflix Release Date datetime64[ns] Country Availability object Year int64 Month int64 dtype: object
Top Genres¶
Create a dataset to work with genres
InĀ [13]:
df_gen = df_selected.copy()
# Drop rows with missing values in the 'Genre', 'IMDb Score', and 'IMDb Votes' columns
df_gen.dropna(subset=['Genre','IMDb Score','IMDb Votes'],inplace=True)
InĀ [14]:
# Manipulate 'Genre' column and explode into separate rows
df_gen['Genre'] = df['Genre'].astype("string")
df_gen['Genre']=df_gen['Genre'].map(lambda x:x.split(','))
df_gen =df_gen.explode('Genre').reset_index(drop=True)
df_gen['Genre'] = df_gen['Genre'].str.strip()
InĀ [15]:
# Filter the dataframe to include only rows with 'IMDb Score' greater than the mean 'IMDb Score'
df_gen = df_gen[df_gen['IMDb Score'] > df_gen['IMDb Score'].mean()]
# Create a new column 'mul_rating' as a weighted average of 'IMDb Score' and 'IMDb Votes'
df_gen['mul_rating'] =((0.2*df_gen['IMDb Score'] + 0.8*df_gen['IMDb Votes'])/((0.2*df_gen['IMDb Score'] + 0.8*df_gen['IMDb Votes']).max()))*100
InĀ [16]:
df_gen.describe().T
Out[16]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Hidden Gem Score | 18689.0 | 5.884280 | 2.162107 | 1.700000 | 3.900000 | 5.80000 | 8.100000 | 9.8 |
| IMDb Score | 18689.0 | 7.381791 | 0.567072 | 6.600000 | 6.900000 | 7.30000 | 7.800000 | 9.7 |
| IMDb Votes | 18689.0 | 71812.519557 | 168118.728762 | 5.000000 | 991.000000 | 5753.00000 | 59010.000000 | 2354197.0 |
| Awards Nominated For | 12899.0 | 20.844872 | 38.008842 | 1.000000 | 3.000000 | 8.00000 | 21.000000 | 386.0 |
| Awards Received | 11022.0 | 12.107149 | 22.572374 | 1.000000 | 2.000000 | 5.00000 | 12.000000 | 300.0 |
| Year | 18689.0 | 2017.387394 | 1.976495 | 2015.000000 | 2015.000000 | 2017.00000 | 2019.000000 | 2021.0 |
| Month | 18689.0 | 6.496228 | 3.430340 | 1.000000 | 4.000000 | 6.00000 | 10.000000 | 12.0 |
| mul_rating | 18689.0 | 3.050479 | 7.141229 | 0.000282 | 0.042165 | 0.24445 | 2.506655 | 100.0 |
InĀ [17]:
# Group the dataframe by 'Genre' and compute average 'IMDb Score', total 'IMDb Votes', and mean 'mul_rating'
df_gen_plot = df_gen.groupby('Genre').agg({'IMDb Score': 'mean',
'IMDb Votes': 'sum',
'mul_rating': 'mean'}).reset_index().sort_values('mul_rating', ascending=False)
# Create a copy of the dataframe with min-max normalized 'mul_rating'
normalized_df = df_gen_plot.copy()
normalized_df['mul_rating'] = (df_gen_plot['mul_rating'] - df_gen_plot['mul_rating'].min()) / (df_gen_plot['mul_rating'].max() - df_gen_plot['mul_rating'].min()) * 10
Plot Genres vs Rating
InĀ [18]:
# Create a bar plot of the normalized ratings for each genre
plt.figure(figsize=(12,5), dpi=150)
sns.barplot(data= normalized_df, x='Genre', y='mul_rating', palette='rocket')
plt.title('Genres Rating')
plt.ylabel('Rating')
plt.xlabel('Genre')
plt.xticks(rotation = 45, ha='right')
plt.show()
Popular Genres per Year¶
InĀ [19]:
# Grouping the data by year and genre to get the number of titles per genre per year
genre_per_year = df_gen.groupby(['Year', 'Genre'])['Year'].count().reset_index(name='Count of Genres per Year')
# Sorting the data by year and count of genres per year in descending order
genre_per_year = genre_per_year.sort_values(['Year', 'Count of Genres per Year'], ascending=False)
# Selecting the data for years between 2016 and 2020
genre_per_year = genre_per_year[(genre_per_year['Year'] >= 2016) & (genre_per_year['Year'] <= 2020)]
# Selecting the top 5 genres for each year
top_genres_per_year = genre_per_year.groupby('Year').head(5).reset_index(drop=True)
InĀ [20]:
# Creating a bar plot to show the top 5 genres for each year
plt.figure(figsize=(12, 5))
palette = 'flare'
sns.barplot(data=top_genres_per_year, x='Year', y='Count of Genres per Year', hue='Genre', palette=palette, dodge=False)
plt.ylim(0, 900)
plt.title('Top 5 Genres per Year')
plt.xlabel('Year')
plt.ylabel('Number of Titles')
plt.legend(loc='center right', bbox_to_anchor=(1.3, 0.5))
plt.tight_layout()
plt.show()
Yearly series and movies production¶
InĀ [21]:
# Group the data by year and series/movie
df_prod = df_selected.groupby(['Year', 'Series or Movie']).agg({'Title': 'count'}).reset_index()
# Filter the data to only include years between 2016 and 2020
df_prod = df_prod[(df_prod['Year'] >= 2016) & (df_prod['Year'] <= 2020)]
InĀ [22]:
# Creating a bar plot to show the number of Movie and Series titles each year
sns.barplot(data=df_prod, x="Year", y="Title", hue="Series or Movie", palette='rocket')
plt.title('Number of Netflix Titles Released per Year')
plt.xlabel('Year')
plt.ylabel('Number of Titles')
plt.legend(title='', loc='upper left')
plt.show()
Acting Duos¶
InĀ [23]:
# Select relevant columns and rows with valid actors
df_act = df_selected[['Title', 'Actors']].dropna()
df_act['Actors'] = df_act['Actors'].astype(str).str.split(',')
# Create list of actor pairs and count occurrences using Counter
comblist = []
for actors in df_act['Actors']:
comb = combinations(actors, 2)
for pair in comb:
comblist.append(pair)
c = Counter(comblist)
# Get top 15 pairs and plot a scatter plot
pairs = c.most_common(15)
pairs1 = np.array(pairs)
lispa1 = list(map(lambda x: list(x[0]), pairs))
lispa2 = list(map(lambda x: x[1], pairs))
b = list(map(lambda x: '-'.join(x), lispa1))
InĀ [24]:
#Counting the number of times pairs of actors have appeared in movies together
plt.figure(figsize=(12, 5), dpi=150)
plt.scatter(b, lispa2, alpha=0.8, s=100, edgecolors='black', c='firebrick')
plt.xticks(rotation=45, ha='right')
plt.title('Most Dynamic Acting Duos', fontsize=16)
plt.xlabel('Acting Duos', fontsize=14)
plt.ylabel('Number of Movies', fontsize=14)
plt.tight_layout()
plt.show()
Top Directors¶
InĀ [25]:
# Extract directors data from the DataFrame
df_dir = df[df['Director'].notna()]
df_dir['Director'] = df_dir['Director'].astype("string")
df_dir['Director']=df_dir['Director'].map(lambda x:x.split(','))
df_dir = df_dir.explode('Director').reset_index(drop=True)
# Count the number of movies each director has directed
df_dir['Director_Count'] = df_dir['Director'].map(df_dir['Director'].value_counts())
# Aggregate the data based on director and calculate relevant metrics
df_dir1 = df_dir.groupby('Director').agg({'Hidden Gem Score':np.mean,
'IMDb Score':np.mean,
'IMDb Votes':np.sum,
'Director_Count':pd.Series.count,
'Awards Received':np.sum}).reset_index()
# Filter the data to only include directors with above-average IMDb Votes, Director_Count, and Awards Received, and sort by IMDb Score
df_dir2 = df_dir1[(df_dir1['IMDb Votes'] > df_dir1['IMDb Votes'].mean())
& (df_dir1['Director_Count'] > df_dir1['Director_Count'].mean())
& (df_dir1['Awards Received'] > df_dir1['Awards Received'].mean())].sort_values(['IMDb Score',
'IMDb Votes'], ascending=False)[['Director','IMDb Score']].head(10)
Scatterplot
InĀ [26]:
# Plot a scatterplot of IMDb Score vs. Director for the top 10 directors
plt.figure(figsize=(12, 5), dpi=150)
plt.scatter(x=df_dir2['Director'], y=df_dir2['IMDb Score'], alpha=0.8, s=100, edgecolors='black', c='red')
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)
plt.title('Top Directors according to IMDb Score', fontsize=16, color='white')
plt.xlabel('Director', fontsize=14)
plt.ylabel('IMDb Score', fontsize=14)
plt.tight_layout()
plt.show()
Barplot
InĀ [27]:
fig = px.bar(df_dir2, x='Director', y='IMDb Score', color='IMDb Score',
color_continuous_scale=px.colors.sequential.Reds)
fig.update_layout(title='Top Directors according to IMDb Score',
xaxis_title='Director', yaxis_title='IMDb Score',
xaxis_tickangle=-45)
fig.show()
Top Genre-Diverse Actors¶
InĀ [28]:
# Creating a dataframe to search for genre diverse actors
df_act = df[df['Actors'].notna()]
df_act['Actors'] = df_act['Actors'].astype("string")
df_act['Actors']=df_act['Actors'].map(lambda x:x.split(','))
df_act=df_act.explode('Actors').reset_index(drop=True)
df_act
df_act = df_act[df_act['Genre'].notna()]
df_act['Genre'] = df_act['Genre'].astype("string")
df_act['Genre']=df_act['Genre'].map(lambda x:x.split(','))
df_act=df_act.explode('Genre').reset_index(drop=True)
df_act['Genre'] = df_act['Genre'].str.strip()
df_act.head()
Out[28]:
| Title | Genre | Tags | Languages | Series or Movie | Hidden Gem Score | Country Availability | Runtime | Director | Writer | Actors | View Rating | IMDb Score | Rotten Tomatoes Score | Metacritic Score | Awards Received | Awards Nominated For | Boxoffice | Release Date | Netflix Release Date | Production House | Netflix Link | IMDb Link | Summary | IMDb Votes | Image | Poster | TMDb Trailer | Trailer Site | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Lets Fight Ghost | Crime | Comedy Programmes,Romantic TV Comedies,Horror ... | Swedish, Spanish | Series | 4.3 | Thailand | < 30 minutes | Tomas Alfredson | John Ajvide Lindqvist | KƄre Hedebrant | R | 7.9 | 98.0 | 82.0 | 74.0 | 57.0 | $2,122,065 | 12 Dec 2008 | 2021-03-04 | Canal+, Sandrew Metronome | https://www.netflix.com/watch/81415947 | https://www.imdb.com/title/tt1139797 | A med student with a supernatural gift tries t... | 205926.0 | https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... | https://m.media-amazon.com/images/M/MV5BOWM4NT... | NaN | NaN |
| 1 | Lets Fight Ghost | Drama | Comedy Programmes,Romantic TV Comedies,Horror ... | Swedish, Spanish | Series | 4.3 | Thailand | < 30 minutes | Tomas Alfredson | John Ajvide Lindqvist | KƄre Hedebrant | R | 7.9 | 98.0 | 82.0 | 74.0 | 57.0 | $2,122,065 | 12 Dec 2008 | 2021-03-04 | Canal+, Sandrew Metronome | https://www.netflix.com/watch/81415947 | https://www.imdb.com/title/tt1139797 | A med student with a supernatural gift tries t... | 205926.0 | https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... | https://m.media-amazon.com/images/M/MV5BOWM4NT... | NaN | NaN |
| 2 | Lets Fight Ghost | Fantasy | Comedy Programmes,Romantic TV Comedies,Horror ... | Swedish, Spanish | Series | 4.3 | Thailand | < 30 minutes | Tomas Alfredson | John Ajvide Lindqvist | KƄre Hedebrant | R | 7.9 | 98.0 | 82.0 | 74.0 | 57.0 | $2,122,065 | 12 Dec 2008 | 2021-03-04 | Canal+, Sandrew Metronome | https://www.netflix.com/watch/81415947 | https://www.imdb.com/title/tt1139797 | A med student with a supernatural gift tries t... | 205926.0 | https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... | https://m.media-amazon.com/images/M/MV5BOWM4NT... | NaN | NaN |
| 3 | Lets Fight Ghost | Horror | Comedy Programmes,Romantic TV Comedies,Horror ... | Swedish, Spanish | Series | 4.3 | Thailand | < 30 minutes | Tomas Alfredson | John Ajvide Lindqvist | KƄre Hedebrant | R | 7.9 | 98.0 | 82.0 | 74.0 | 57.0 | $2,122,065 | 12 Dec 2008 | 2021-03-04 | Canal+, Sandrew Metronome | https://www.netflix.com/watch/81415947 | https://www.imdb.com/title/tt1139797 | A med student with a supernatural gift tries t... | 205926.0 | https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... | https://m.media-amazon.com/images/M/MV5BOWM4NT... | NaN | NaN |
| 4 | Lets Fight Ghost | Romance | Comedy Programmes,Romantic TV Comedies,Horror ... | Swedish, Spanish | Series | 4.3 | Thailand | < 30 minutes | Tomas Alfredson | John Ajvide Lindqvist | KƄre Hedebrant | R | 7.9 | 98.0 | 82.0 | 74.0 | 57.0 | $2,122,065 | 12 Dec 2008 | 2021-03-04 | Canal+, Sandrew Metronome | https://www.netflix.com/watch/81415947 | https://www.imdb.com/title/tt1139797 | A med student with a supernatural gift tries t... | 205926.0 | https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... | https://m.media-amazon.com/images/M/MV5BOWM4NT... | NaN | NaN |
InĀ [29]:
#Top Genre-Diverse Actors
df_act2=df_act.groupby('Actors').nunique()[['Genre','Title']].reset_index()
df_act_div = df_act2.sort_values('Genre',ascending=False).head(10)
Barplot
InĀ [30]:
sns.barplot(data=df_act_div, x="Actors", y="Genre", palette='flare')
plt.title('Top Genre-Diverse Actors')
plt.xticks(rotation = 45, ha='right')
plt.xlabel('Actors')
plt.ylabel('Genres')
plt.show()
Top Genre-Specific Actors¶
InĀ [31]:
# Top genre-specific actors
df_act2['Difference'] = df_act2['Title']-df_act2['Genre']
df_act_spec = df_act2.sort_values('Difference',ascending=False).head(10)
InĀ [32]:
#Plot a barplot for Genre-Specific Actors
f, ax = plt.subplots(figsize=(6, 6))
# Plot the total titles for each actor
sns.set_color_codes("pastel")
sns.barplot(x="Title", y="Actors", data=df_act_spec,
label="Title", color='r')
# Plot the corresponding genres
sns.set_color_codes("muted")
sns.barplot(x="Genre", y="Actors", data=df_act_spec,
label="Genre", color='r')
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(0, 24), ylabel="",
xlabel="Top Genre-Specific Actors")
sns.despine(left=True, bottom=True)
Box Office¶
InĀ [33]:
# Creating a dataframe for Box Office
df_boxoffice = df_gen.dropna(subset='Boxoffice')
InĀ [34]:
df_boxoffice['Boxoffice'] = df_boxoffice['Boxoffice'].replace(r'[\$,]','',regex=True)
df_boxoffice['Boxoffice'] = pd.to_numeric(df_boxoffice['Boxoffice'])
InĀ [35]:
# Creating two dataframes for Series and Movies Box Office
df_boxoffice_series = df_boxoffice[df_boxoffice['Series or Movie'] == 'Series'].groupby(['Genre']).agg({'Boxoffice' : np.sum}).sort_values('Boxoffice', ascending=False).reset_index().head(10)
df_boxoffice_movies = df_boxoffice[df_boxoffice['Series or Movie'] == 'Movie'].groupby(['Genre']).agg({'Boxoffice' : np.sum}).sort_values('Boxoffice', ascending=False).reset_index().head(10)
InĀ [36]:
# Plot a Barplot for Series Box Office by Genre
sns.barplot(data=df_boxoffice_series, x="Genre", y="Boxoffice", palette='flare')
plt.xticks(rotation = 45, ha='right')
plt.title('Series Box Office by Genre')
plt.ylabel('Box Office in million $')
plt.show()
InĀ [37]:
# Plot a Barplot for Movies Box Office by Genre
sns.barplot(data=df_boxoffice_movies, x="Genre", y="Boxoffice", palette='flare')
plt.xticks(rotation = 45, ha='right')
plt.title('Movies Box Office by Genre')
plt.ylabel('Box Office in 10 milion $')
plt.savefig('figure1.png', transparent=True,bbox_inches='tight') # save as png
plt.show()